# import dependencies
library(NLP)
library(tm)
library(SnowballC)
# read in csv
beatles <- read.csv("lyrics_beatles.csv", header = TRUE, stringsAsFactors = FALSE)
str(beatles) # view data
'data.frame': 187 obs. of 4 variables:
$ songs_title : chr "I Saw Her Standing There" "Misery" "Anna (Go To Him)" "Chains" ...
$ songs_Writers : chr "Writer(s): JOHN LENNON, PAUL MCCARTNEY" "Writer(s): John Winston Lennon, Paul James Mccartney" "Writer(s): ARTHUR ALEXANDER" "Writer(s): Gerry Goffin, Carole King" ...
$ songs_Song_Lyrics: chr "(1,2,3,4!)\nWell, she was just seventeen\nYou know what I mean\nAnd the way she looked was way beyond compare\n"| __truncated__ "The world is treating me bad... Misery\nI'm the kind of guy\nWho never used to cry\nThe world is treating me ba"| __truncated__ "Anna\nYou come and ask me, girl\nTo set you free, girl\nYou say he loves you more than me\nSo I will set you fr"| __truncated__ "Chains, my baby's got me locked up in chains\nAnd they ain't the kind that you can see\nWhoa, oh, these chains "| __truncated__ ...
$ Year : int 1962 1962 1962 1962 1962 1962 1962 1962 1962 1962 ...
# create corpus of lyrics (column 3)
corpus <- Corpus(VectorSource(beatles[ , 3]))
corpus
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 0
Content: documents: 187
# check corpus
corpus[[1]]
<<PlainTextDocument>>
Metadata: 7
Content: chars: 924
# transform data to lowercase
corpus <- tm_map(corpus, tolower)
transformation drops documents
corpus
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 0
Content: documents: 187
# transform data to remove punctuation and numbers
corpus <- tm_map(corpus, removePunctuation)
transformation drops documents
corpus <- tm_map(corpus, removeNumbers)
transformation drops documents
# remove english stopwords and complete stemming
corpus <- tm_map(corpus, removeWords, stopwords("english"))
transformation drops documents
corpus <- tm_map(corpus, stemDocument)
transformation drops documents
dtm <- DocumentTermMatrix(corpus) # data to 'data term matrix'
dtm
<<DocumentTermMatrix (documents: 187, terms: 1719)>>
Non-/sparse entries: 7000/314453
Sparsity : 98%
Maximal term length: 17
Weighting : term frequency (tf)
# Questions 2
# * 187 is the number of entries (in this case songs)
# * 1719 is the number terms found at least once in the 187 entries
# * Non-/spares entries reads like this: 314453 cells in frequencies are 0, 7000 have non-zero values.
# * 98% of all cells are zero (which is 314453/(314453+7000))
dtm_99 <- removeSparseTerms(dtm, 0.99)
dtm_99
<<DocumentTermMatrix (documents: 187, terms: 776)>>
Non-/sparse entries: 6057/139055
Sparsity : 96%
Maximal term length: 16
Weighting : term frequency (tf)
dtm_98 <- removeSparseTerms(dtm, 0.98)
dtm_98
<<DocumentTermMatrix (documents: 187, terms: 412)>>
Non-/sparse entries: 5225/71819
Sparsity : 93%
Maximal term length: 10
Weighting : term frequency (tf)
dtm_97 <- removeSparseTerms(dtm, 0.97)
dtm_97
<<DocumentTermMatrix (documents: 187, terms: 274)>>
Non-/sparse entries: 4621/46617
Sparsity : 91%
Maximal term length: 10
Weighting : term frequency (tf)
dtm_96 <- removeSparseTerms(dtm, 0.96)
dtm_96
<<DocumentTermMatrix (documents: 187, terms: 219)>>
Non-/sparse entries: 4267/36686
Sparsity : 90%
Maximal term length: 10
Weighting : term frequency (tf)
# Question 4
# * As % sparsity decreases, the dtm becomes smaller
dtm.beatles <- removeSparseTerms(dtm, 0.90)
dtm.beatles
<<DocumentTermMatrix (documents: 187, terms: 72)>>
Non-/sparse entries: 2515/10949
Sparsity : 81%
Maximal term length: 7
Weighting : term frequency (tf)
# turn dtm into dataframe
beatles.lyrics <- as.data.frame(as.matrix(dtm.beatles))
head(beatles.lyrics)
# determine frequency of terms by summing columns
freq.dtm <- sort(colSums(beatles.lyrics), decreasing=TRUE)
# create a dataframe with frequency data
freq.data <- data.frame(word = names(freq.dtm), freq=freq.dtm)
# plot frequencies
library(ggplot2)
Attaching package: ‘ggplot2’
The following object is masked from ‘package:NLP’:
annotate
freq.plot <- ggplot(freq.data, aes(reorder(word, freq), freq)) + geom_col() +
xlab(NULL) + coord_flip() + ylab("Frequency")+
theme(text = element_text(size = 8))
print(freq.plot)

# create dynamical heatmap
library(qgraph)
library(plotly)
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
cor.terms <- cor_auto(beatles.lyrics)
Variables detected as ordinal: heart; ooh; way; alway; thing; world; leav; want; around; away; take; blue; show; said; there; tri; word; hear; though; yes; head; mind; eye; that; right; sing; wait; youv; everyth; find; turn
Correlation matrix is not positive definite. Finding nearest positive definite matrix
a <- list(showticklabels = TRUE, tickangle = -45)
plot.cor <- plot_ly(x = colnames(cor.terms), y = colnames(cor.terms),
z = cor.terms, type = "heatmap") %>%
layout(xaxis = a, showlegend = FALSE, margin = list(l=100,b=100,r=100,u=100))
print(plot.cor)
NULL
# Question 7.1: highest correlated terms are 'love' and
LS0tCnRpdGxlOiAiVGV4dCBNaW5pbmcgQXNzaWdubWVudCAxIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpgYGB7cn0KIyBpbXBvcnQgZGVwZW5kZW5jaWVzCmxpYnJhcnkoTkxQKQpsaWJyYXJ5KHRtKQpsaWJyYXJ5KFNub3diYWxsQykKYGBgCgpgYGB7cn0KIyByZWFkIGluIGNzdgpiZWF0bGVzIDwtIHJlYWQuY3N2KCJseXJpY3NfYmVhdGxlcy5jc3YiLCBoZWFkZXIgPSBUUlVFLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpCnN0cihiZWF0bGVzKSAjIHZpZXcgZGF0YQpgYGAKCmBgYHtyfQojIGNyZWF0ZSBjb3JwdXMgb2YgbHlyaWNzIChjb2x1bW4gMykKY29ycHVzIDwtIENvcnB1cyhWZWN0b3JTb3VyY2UoYmVhdGxlc1sgLCAzXSkpCmNvcnB1cwpgYGAKCmBgYHtyfQojIGNoZWNrIGNvcnB1cwpjb3JwdXNbWzFdXQpgYGAKCmBgYHtyfQojIHRyYW5zZm9ybSBkYXRhIHRvIGxvd2VyY2FzZQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgdG9sb3dlcikKY29ycHVzCmBgYAoKYGBge3J9CiMgdHJhbnNmb3JtIGRhdGEgdG8gcmVtb3ZlIHB1bmN0dWF0aW9uIGFuZCBudW1iZXJzCmNvcnB1cyA8LSB0bV9tYXAoY29ycHVzLCByZW1vdmVQdW5jdHVhdGlvbikKY29ycHVzIDwtIHRtX21hcChjb3JwdXMsIHJlbW92ZU51bWJlcnMpCmBgYAoKYGBge3J9CiMgcmVtb3ZlIGVuZ2xpc2ggc3RvcHdvcmRzIGFuZCBjb21wbGV0ZSBzdGVtbWluZwpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgcmVtb3ZlV29yZHMsIHN0b3B3b3JkcygiZW5nbGlzaCIpKQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgc3RlbURvY3VtZW50KQpkdG0gPC0gRG9jdW1lbnRUZXJtTWF0cml4KGNvcnB1cykgIyBkYXRhIHRvICdkb2N1bWVudCB0ZXJtIG1hdHJpeCcKZHRtCmBgYApgYGB7cn0KIyBRdWVzdGlvbnMgMgojICogMTg3IGlzIHRoZSBudW1iZXIgb2YgZW50cmllcyAoaW4gdGhpcyBjYXNlIHNvbmdzKQojICogMTcxOSBpcyB0aGUgbnVtYmVyIHRlcm1zIGZvdW5kIGF0IGxlYXN0IG9uY2UgaW4gdGhlIDE4NyBlbnRyaWVzCiMgKiBOb24tL3NwYXJlcyBlbnRyaWVzIHJlYWRzIGxpa2UgdGhpczogMzE0NDUzIGNlbGxzIGluIGZyZXF1ZW5jaWVzIGFyZSAwLCA3MDAwIGhhdmUgbm9uLXplcm8gdmFsdWVzLiAKIyAqIDk4JSBvZiBhbGwgY2VsbHMgYXJlIHplcm8gKHdoaWNoIGlzIDMxNDQ1My8oMzE0NDUzKzcwMDApKQpgYGAKCmBgYHtyfQpkdG1fOTkgPC0gcmVtb3ZlU3BhcnNlVGVybXMoZHRtLCAwLjk5KQpkdG1fOTkKYGBgCmBgYHtyfQpkdG1fOTggPC0gcmVtb3ZlU3BhcnNlVGVybXMoZHRtLCAwLjk4KQpkdG1fOTgKYGBgCgpgYGB7cn0KZHRtXzk3IDwtIHJlbW92ZVNwYXJzZVRlcm1zKGR0bSwgMC45NykKZHRtXzk3CmBgYAoKYGBge3J9CmR0bV85NiA8LSByZW1vdmVTcGFyc2VUZXJtcyhkdG0sIDAuOTYpCmR0bV85NgpgYGAKCmBgYHtyfQojIFF1ZXN0aW9uIDQKIyAqIEFzICUgc3BhcnNpdHkgZGVjcmVhc2VzLCB0aGUgZHRtIGJlY29tZXMgc21hbGxlcgpgYGAKCmBgYHtyfQpkdG0uYmVhdGxlcyA8LSByZW1vdmVTcGFyc2VUZXJtcyhkdG0sIDAuOTApCmR0bS5iZWF0bGVzCgojIFF1ZXN0aW9uIDUKIyAqIGR0bS5iZWF0bGVzIGNvbnRhaW5zIDcyIHRlcm1zCmBgYAoKYGBge3J9CiMgdHVybiBkdG0gaW50byBkYXRhZnJhbWUKYmVhdGxlcy5seXJpY3MgPC0gYXMuZGF0YS5mcmFtZShhcy5tYXRyaXgoZHRtLmJlYXRsZXMpKQpoZWFkKGJlYXRsZXMubHlyaWNzKQpgYGAKCmBgYHtyfQojIGRldGVybWluZSBmcmVxdWVuY3kgb2YgdGVybXMgYnkgc3VtbWluZyBjb2x1bW5zCmZyZXEuZHRtIDwtIHNvcnQoY29sU3VtcyhiZWF0bGVzLmx5cmljcyksIGRlY3JlYXNpbmc9VFJVRSkKYGBgCgpgYGB7cn0KIyBjcmVhdGUgYSBkYXRhZnJhbWUgd2l0aCBmcmVxdWVuY3kgZGF0YQpmcmVxLmRhdGEgPC0gZGF0YS5mcmFtZSh3b3JkID0gbmFtZXMoZnJlcS5kdG0pLCBmcmVxPWZyZXEuZHRtKQpgYGAKCmBgYHtyfQojIHBsb3QgZnJlcXVlbmNpZXMKCmxpYnJhcnkoZ2dwbG90MikKZnJlcS5wbG90IDwtIGdncGxvdChmcmVxLmRhdGEsIGFlcyhyZW9yZGVyKHdvcmQsIGZyZXEpLCBmcmVxKSkgKyBnZW9tX2NvbCgpICsgCiAgICB4bGFiKE5VTEwpICsgY29vcmRfZmxpcCgpICsgeWxhYigiRnJlcXVlbmN5IikrCiAgICB0aGVtZSh0ZXh0ID0gZWxlbWVudF90ZXh0KHNpemUgPSA4KSkKcHJpbnQoZnJlcS5wbG90KQoKIyBRdWVzdGlvbnMgCiMgNi4xOiB0b3AgKDMpIHRlcm1zIGFyZSAnbG92ZScsICdrbm93JywgJ2Rvbid0JwojIDYuMjogYm90dG9tICgzKSB0ZXJtcyBhcmUgJ2hlYXInLCAnc2hvdycsICd0dXJuJyAKYGBgCgpgYGB7cn0KIyBjcmVhdGUgZHluYW1pY2FsIGhlYXRtYXAKbGlicmFyeShxZ3JhcGgpCmxpYnJhcnkocGxvdGx5KQpsaWJyYXJ5KGRwbHlyKQpjb3IudGVybXMgPC0gY29yX2F1dG8oYmVhdGxlcy5seXJpY3MpCmEgPC0gbGlzdChzaG93dGlja2xhYmVscyA9IFRSVUUsIHRpY2thbmdsZSA9IC00NSkKcGxvdC5jb3IgPC0gcGxvdF9seSh4ID0gY29sbmFtZXMoY29yLnRlcm1zKSwgeSA9IGNvbG5hbWVzKGNvci50ZXJtcyksCiAgICAgICAgICAgICAgICAgICAgICB6ID0gY29yLnRlcm1zLCB0eXBlID0gImhlYXRtYXAiKSAlPiUKICAgIGxheW91dCh4YXhpcyA9IGEsICBzaG93bGVnZW5kID0gRkFMU0UsIG1hcmdpbiA9IGxpc3QobD0xMDAsYj0xMDAscj0xMDAsdT0xMDApKQpwcmludChwbG90LmNvcikKCiMgUXVlc3Rpb24gNy4xOiBoaWdoZXN0IGNvcnJlbGF0ZWQgdGVybXMgYXJlICdsb3ZlJyBhbmQgJ25lZWQnIApgYGAKCgo=